
# coding: utf-8

import numpy as np
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder

# Input: engineered training data (presumably produced by an earlier
# feature-engineering step — confirm against the rest of the pipeline).
fe_dst_path = 'data/train_tiny_new.csv'

# Output: the same rows augmented with one-hot and GBDT leaf features.
fe_gbdt_path = "data/FE_gbdt_data.csv"


def listNormalize(X):
    """Scale the values of *X* linearly into the range [0, 1] (min-max).

    Parameters
    ----------
    X : array-like of numbers
        One-dimensional sequence of values to rescale.

    Returns
    -------
    list
        Min-max normalized values, same length as *X*.

    Notes
    -----
    When every value is identical the naive formula divides by zero and
    yields NaNs; that case is handled explicitly by returning all zeros.
    """
    X_np = np.asarray(X, dtype=float)

    lo = X_np.min()
    span = X_np.max() - lo
    if span == 0:
        # Constant input: map every value to the low end of the range
        # instead of producing NaN via 0/0.
        return [0.0] * len(X_np)

    # Vectorized scaling instead of a per-element Python loop.
    return list((X_np - lo) / span)


# Load the engineered training data and print a summary of its columns.
train_data = pd.read_csv(fe_dst_path)
train_data.info()


# --- One-hot encoding ---
# Every column except the row id and the click label is treated as a
# categorical feature and expanded into one indicator column per level.
x_col = [col for col in train_data.columns if col not in ['id', 'click']]

feature_dict = {}
for col in x_col:
    # Map category values to integer codes first ...
    encoder = LabelEncoder()
    codes = encoder.fit_transform(train_data[col])

    # ... then expand the codes into a dense 0/1 indicator matrix.
    onehot = OneHotEncoder()
    indicator = onehot.fit_transform(codes.reshape(-1, 1)).toarray()

    # Name each indicator column "<feature>_<original level>".
    names = ["%s_%s" % (col, level) for level in encoder.classes_]
    feature_dict[col] = pd.DataFrame(indicator, columns=names)

# Stack all per-feature indicator frames side by side.
ohe_data = pd.concat(list(feature_dict.values()), axis=1)
ohe_data.shape

# --- GBDT leaf-index features ---
# Fit a small gradient-boosted ensemble on the one-hot features; the leaf
# each sample falls into (one per tree) becomes a new numeric feature.
X = ohe_data
y = train_data['click']

gbm = GradientBoostingClassifier(random_state=10, n_estimators=10, max_depth=5)
gbm.fit(X, y)

# apply() returns leaf indices with shape (n_samples, n_estimators, 1)
# for binary classification — TODO confirm against the sklearn version.
test = gbm.apply(X)

n_samples = len(test[::, 0])
n_trees = len(test[0])

gbdt_feature_dict = {}
for tree_no in range(n_trees):
    # One normalized column per tree: the leaf index of every sample.
    leaf_per_sample = [row[0] for row in test[::, tree_no]]
    gbdt_feature_dict["gbdt_" + str(tree_no)] = listNormalize(leaf_per_sample)

gbdt_feature = pd.DataFrame(gbdt_feature_dict, index=range(n_samples))
gbdt_feature.shape

# Persist id, label, one-hot features and GBDT leaf features together.
new_data = pd.concat([train_data.id, train_data.click, ohe_data, gbdt_feature], axis=1)
new_data.to_csv(fe_gbdt_path, index=False)
new_data.shape

